/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.quality; import java.io.*; import java.util.*; import net.nutch.searcher.*; import net.nutch.quality.dynamic.*; /********************************************************** * The URLInsetTester takes a description of a search engine * and a list of URLs. (The search engine can be an external * one or a local Nutch index.) We go through every URL in * the list, and figure out whether the search engine in * question has indexed the URL. We emit a binary file * that lists each URL, plus a "true" or "false" answer. * * @author Mike Cafarella **********************************************************/ public class URLInsetTester { boolean debug = false; PageExtractor.IExtractor extractor; /** * Take a PageExtractor for a search engine. */ public URLInsetTester(PageExtractor.IExtractor extractor, boolean debug) { this.extractor = extractor; this.debug = debug; } /** * Load a list of urls from a binary list, then call testURLs(). */ public void testURLs(File urlList, TreeSet knownInset, File outputSet) throws IOException { // Load in the URLs whose presence we should test. Vector urls = new Vector(); DataInputStream in = new DataInputStream(new FileInputStream(urlList)); try { int numItems = in.readInt(); for (int i = 0; i < numItems; i++) { urls.add(in.readUTF().trim()); in.readInt(); } } finally { in.close(); } testURLs(urls, knownInset, outputSet); } /** * Just pass in a list of the URLs we want to test. * Go through the list of URLs, outputting whether each * one occurs in the search engine's result list. */ public void testURLs(Vector urls, TreeSet knownInset, File outputSet) throws IOException { // Output the test results DataOutputStream out = new DataOutputStream(new FileOutputStream(outputSet)); try { out.writeInt(urls.size()); for (Enumeration e = urls.elements(); e.hasMoreElements(); ) { String url = (String) e.nextElement(); ArrayList results = null; try { results = extractor.applyQuery(url); } catch (IOException ie) { System.err.println("Could not extract results for " + url); } boolean hasURL = false; if (knownInset != null && knownInset.contains(url)) { hasURL = true; } else if (results != null) { for (Iterator it = results.iterator(); it.hasNext(); ) { String val = (String) it.next(); if (val.trim().compareTo(url) == 0) { hasURL = true; break; } else { if (debug) { System.out.println("Query url " + url + " does not match result " + val); } } } } else { if (debug) { System.out.println("Got no results when searching for " + url); } } out.writeUTF(url); out.writeBoolean(hasURL); if (debug) { System.out.println("For " + url + ": " + hasURL); } } } finally { out.close(); } } /** * Provide this program a target search engine, a set of * URLs, and a place to write the output. */ public static void main(String argv[]) throws IOException, ParseException { if (argv.length < 4) { System.out.println("Usage: java net.nutch.quality.URLInsetTester [-externalengine <pageDesc> <userAgent>] [-nutchengine <segments>] <queryList> <setMembershipResults> [-debug]"); return; } int pos = argv.length; boolean debug = false; String pageDesc = null, userAgent = null, segments = null, queryList = null, outputSet = null; // Parse command if ("-externalengine".equals(argv[0])) { pageDesc = argv[1]; userAgent = argv[2]; pos = 3; } else if ("-nutchengine".equals(argv[0])) { segments = argv[1]; pos = 2; } else { System.out.println("Must use command -externalengine or -nutchengine"); return; } // Get rest of args queryList = argv[pos++]; outputSet = argv[pos++]; if (argv.length > pos && "-debug".equals(argv[pos])) { debug = true; } // Prepare the extractor PageExtractor.IExtractor extractor = null; if ("-externalengine".equals(argv[0])) { extractor = new PageExtractor.RemotePageExtractor(new File(pageDesc), userAgent, debug); } else if ("-nutchengine".equals(argv[0])) { extractor = new PageExtractor.NutchExtractor(segments); } // Load in plaintext urls Vector urls = new Vector(); BufferedReader in = new BufferedReader(new FileReader(queryList)); try { String url = in.readLine(); while (url != null) { urls.add(url); url = in.readLine(); } } finally { in.close(); } // Get results from each, and test them! URLInsetTester uit = new URLInsetTester(extractor, true); uit.testURLs(urls, null, new File(outputSet)); } }